#DOJScraper.py
import urllib
import re 
import os
import csv
import numpy
from threading import Thread


## How many search-result pages are walked (page indexes 0 .. npages - 1)
npages = 33

# Number of worker threads the page/URL lists are split across
nthreads = 10


# Shared accumulator: collectlinks appends one press-release URL per link found
urllist = list()
print("".join(["Collecting Links by Page: ", str(npages), " Total Pages"]))

def collectlinks(pagelist):
    """Collect press-release URLs from the given search-result pages.

    For every page index in ``pagelist`` the DOJ news search page is
    downloaded, each ``/opa/pr/...`` link on it is extracted, and the
    absolute URL is appended to the module-level ``urllist``.  Once a page
    has been fully processed its index is appended to the module-level
    ``completed`` list.

    Args:
        pagelist: iterable of zero-based search-result page indexes.

    Raises:
        ValueError: if any page other than the last one (``npages - 1``)
            does not yield exactly 50 links.
    """
    for i in pagelist:
        # Search URL based on index
        url = "https://www.justice.gov/news?sort=field_pr_date&order=desc&keys=false%20claims%20act&items_per_page=50&f%5B0%5D=type%3Apress_release&page=" + str(i)

        # urllib.urlopen is the Python 2 urllib API; close the response
        # explicitly so the connection is released even if read() fails.
        response = urllib.urlopen(url)
        try:
            site = response.read()
        finally:
            response.close()

        # Grab links
        links = re.findall('<a href="/opa/pr/(.+?)"', site, re.DOTALL)

        # The search asks for 50 items per page, so every page except the
        # last must be full; anything else is treated as an error page.
        if i != npages - 1 and len(links) != 50:
            raise ValueError("error page on page " + str(i))

        for link in links:
            urllist.append("https://www.justice.gov/opa/pr/" + link)

        # Record the page only after it succeeded, so a failed page is
        # visible to whoever inspects ``completed`` afterwards.
        completed.append(i)


# Split the page indexes into one chunk per worker thread
pages = list(range(npages))
pagelists = numpy.array_split(pages, nthreads)
threads = []
completed = [] ## filled in by collectlinks; used to test that threading got it all

# One worker per chunk; each is started as soon as it is created
for chunk in pagelists:
    t = Thread(target=collectlinks, args=(chunk.tolist(),))
    t.start()
    threads.append(t)

# Wait for every worker before inspecting the shared lists
for t in threads:
    t.join()

## Check completion of all pages in threads
completed.sort()
# Explicit raise instead of a bare assert: it is not stripped by -O, and the
# comparison is against a real list (a range object never equals a list on
# Python 3).
if completed != pages:
    missing = sorted(set(pages) - set(completed))
    raise AssertionError("pages not collected: " + str(missing))

#
print("Scraping " + str(len(urllist)) + " Links")

def _clean_text(text):
    """Return article HTML reduced to a single line of plain text."""
    # Drop anything that looks like an HTML tag ('.' does not cross newlines)
    text = re.sub("<.+?>", '', text)

    # Line breaks, tabs and commas all become plain spaces
    for token in ("\n", "\r", "\t", ","):
        text = text.replace(token, " ")

    # Replace the complete entity first so its trailing ';' is not left in
    # the text; the bare form without ';' is still handled afterwards.
    text = text.replace("&nbsp;", " ").replace("&nbsp", " ")

    ## Unicode is deliberately left alone here; it is handled after scraping

    # Bring together multiple whitespace
    return re.sub("( )+", " ", text)


def scrape(pagelist):
    """Download each press release and record its date and cleaned text.

    For every URL in ``pagelist`` the page is fetched, the publication date
    and the ``<article>`` body are extracted, the body is cleaned, and
    ``[date, text]`` is appended to the module-level ``PRList``.  The
    running length of ``PRList`` is printed after each page as progress.

    Args:
        pagelist: iterable of press-release URLs.

    Raises:
        ValueError: if a page does not contain exactly one
            ``date-display-single`` span or exactly one ``<article>``.
    """
    for page in pagelist:
        # Read in page (Python 2 urllib API); always release the connection
        response = urllib.urlopen(page)
        try:
            pr = response.read()
        finally:
            response.close()

        # Find date
        dates = re.findall('<span class="date-display-single".+?>(.+?)</span>', pr)
        if len(dates) != 1:
            raise ValueError("expected exactly one date on " + str(page)
                             + " but found " + str(len(dates)))
        date = dates[0].replace(",", " ")

        # Find text
        articles = re.findall('<article.+?>(.+?)</article>', pr, re.DOTALL)
        if len(articles) != 1:
            raise ValueError("expected exactly one article on " + str(page)
                             + " but found " + str(len(articles)))

        # Append to list
        PRList.append([date, _clean_text(articles[0])])
        print(len(PRList))
#
#
# Shared result list: the scrape workers append their [date, text] rows here
PRList = list()

# One chunk of URLs per worker thread
urllists = numpy.array_split(urllist, nthreads)

# Start a scrape worker for each chunk, keeping the handles for joining
threads = []
for chunk in urllists:
    t = Thread(target=scrape, args=(chunk.tolist(),))
    t.start()
    threads.append(t)

# Block until every worker has finished
for worker in threads:
    worker.join()


# Write out results
with open("PRList.csv","w") as out:
    csv.writer(out).writerows(PRList)




